Data is extracted and cleaned using Python. Some transformations, such as converting datetimes to days since baseline, are also done in Python, and the results are saved into a config spreadsheet.
R reads the cleaned data from the spreadsheet and uses it to: (1) create a graph structure of the flow network, and (2) convert the data into a JAGS-readable format.
# Read every sheet of the config workbook into the current environment,
# one data frame per sheet, named after the sheet (e.g. well_fp_map,
# fp_constants, gen_constants).
# NOTE(review): `assign()` creates variables implicitly from sheet names;
# downstream code assumes the workbook contains exactly the expected
# sheets -- confirm when the workbook changes.
configsheets = excel_sheets(configpath)
for (sheet in configsheets) {
assign(sheet, read_excel(configpath, sheet))
}
# Load the cleaned regression data. `date_numeric` is days since baseline
# (computed in Python); non-positive values are blanked to NA so JAGS
# treats them as missing and imputes them.
# NOTE(review): `date_numeric > 0` also drops the baseline day itself
# (value exactly 0) -- confirm that is intended.
regression_df = read_excel(regdatapath) %>%
mutate(date_numeric=ifelse(date_numeric>0, date_numeric, NA)) # remove dates before baseline
## Warning: package 'bindrcpp' was built under R version 3.4.4
today_numeric = (Sys.time() - base_datetime) %>% as.numeric()
# Assign a unique integer ID to every facility in the network.
# Node order: DUMMY first (guarantees every node has at least one inflow),
# then wells, flash plants (FPs), per-fluid "dummy" generator nodes,
# and generators.
well_names <- c(regression_df$well) %>% unique()
fp_names <- c(well_fp_map$fp, fp_gen_map$fp, fp_constants$fp) %>% unique()
fluid_types <- c('ip', 'lp', 'w')
gen_names <- gen_constants$gen %>% unique() %>% sort()
# One dummy node per generator per fluid type, e.g. "GEN1_ip".
ip_gen_names <- paste(gen_names, 'ip', sep='_')
lp_gen_names <- paste(gen_names, 'lp', sep='_')
w_gen_names <- paste(gen_names, 'w', sep='_')
dummy_gen_names <- c(ip_gen_names, lp_gen_names, w_gen_names) %>% sort()
all_names <- c('DUMMY', well_names, fp_names, dummy_gen_names, gen_names)
# Named lookup vector: ids["SOME_FACILITY"] -> integer ID.
# (seq_along is safe for zero-length input, unlike 1:length.)
ids <- setNames(seq_along(all_names), all_names)
# replace names in data with IDs
# Map facility names to integer IDs everywhere, dropping the name
# columns so all downstream joins and indexing use IDs only.
# NOTE(review): `ids[well]` keeps the names attribute on the new column;
# this is the source of the "different attributes" join warnings later.
regression_df = regression_df %>% mutate(well_id=ids[well]) %>% select(-well)
# `whp` here is an operating set-point; renamed whp_pred to match the
# JAGS prediction variable of the same name.
operating_conditions = operating_conditions %>% mutate(well_id=ids[well]) %>% rename(whp_pred=whp)
fp_constants = fp_constants %>% mutate(fp_id=ids[fp]) %>% select(-fp)
gen_constants = gen_constants %>% mutate(gen_id=ids[gen]) %>% select(-gen)
well_fp_map = well_fp_map %>% mutate(well_id=ids[well], fp_id=ids[fp]) %>% select(-c(well, fp))
# fp_gen_map lists, per FP, the generator receiving each fluid type.
fp_gen_map = fp_gen_map %>% mutate(fp_id=ids[fp], gen_ip_id=ids[gen_ip], gen_lp_id=ids[gen_lp], gen_w_id=ids[gen_w]) %>% select(-c(fp, gen_ip, gen_lp, gen_w))
# create connectivity matrix. v[i, j] == 1 means facility i flows to j.
v <- matrix(0, nrow=length(ids), ncol=length(ids))
# The DUMMY node (ID 1) nominally flows into every node so no node ever
# has an empty inflow set (JAGS cannot index an empty subset); it carries
# zero mass, so totals are unaffected.
v[1, ] <- 1
# wells to FPs.
# seq_len() makes an empty map a no-op (1:nrow would iterate c(1, 0)),
# and named columns are robust to column reordering, unlike [[i, 1]].
for (i in seq_len(nrow(well_fp_map))) {
  v[well_fp_map$well_id[i], well_fp_map$fp_id[i]] <- 1
}
# send ip/lp/w flows from each FP to the matching per-fluid dummy gen.
# Iterate fluids by name (gen_ip_id / gen_lp_id / gen_w_id) rather than
# by column position, and use seq_len for empty-map safety.
for (i in seq_len(nrow(fp_gen_map))) {
  id_i <- fp_gen_map$fp_id[i]
  for (fluid in fluid_types) {
    # e.g. gen_ip_id -> generator name -> "GENNAME_ip" dummy node ID
    gen_col <- paste0('gen_', fluid, '_id')
    facility_j <- names(ids)[fp_gen_map[[gen_col]][i]]
    facility_dummy_j <- paste(facility_j, fluid, sep='_')
    id_j <- ids[facility_dummy_j]
    # An FP may have no generator for a given fluid (NA lookup): skip.
    if (!is.na(id_j)) {
      v[id_i, id_j] <- 1
    }
  }
}
# dummy gens to gens: each generator collects its three per-fluid dummy
# nodes (GEN_ip, GEN_lp, GEN_w), which always exist by construction.
for (i in seq_len(nrow(gen_constants))) {
  id_j <- gen_constants$gen_id[i]
  facility_j <- names(ids)[id_j]
  for (fluid in fluid_types) {
    id_i <- ids[paste(facility_j, fluid, sep='_')]
    v[id_i, id_j] <- 1
  }
}
# Convert the adjacency matrix to JAGS-friendly "inflow list" form:
# m[j, 1:n_inflows[j]] holds the IDs of the facilities flowing into j.
m <- matrix(0, nrow=nrow(v), ncol=max(colSums(v)))
# Running count of inflows recorded per node; avoids the original
# per-edge `sum(m[j,]>0)` rescan (O(n^3) overall -> O(n^2)).
n_filled <- integer(nrow(v))
for (i in seq_len(nrow(v))) {
  for (j in seq_len(ncol(v))) {
    if (v[i, j] == 1) {
      n_filled[j] <- n_filled[j] + 1
      m[j, n_filled[j]] <- i
    }
  }
}
# generate coordinates for plotting the flow graph
# (x = processing stage: dummy < wells < FPs < dummy gens < gens;
#  y = spread of nodes within a stage)
# stringsAsFactors=FALSE keeps `name` as character: under R < 4.0 these
# columns would become factors, and indexing `ids` by a factor uses its
# integer codes rather than the facility names -- a silent mis-mapping.
dummy_locs <- data.frame(name='DUMMY', x=-0.1, y=0, stringsAsFactors=FALSE)
well_locs <- data.frame(name=well_names, x=0, y=seq(1/(length(well_names)-1), 1, length.out=length(well_names)), stringsAsFactors=FALSE)
fp_locs <- data.frame(name=fp_names, x=1, y=seq(0, 1, length.out=length(fp_names)), stringsAsFactors=FALSE)
gen_dummy_locs <- data.frame(name=dummy_gen_names, x=2, y=seq(0, 1, length.out=length(dummy_gen_names)), stringsAsFactors=FALSE)
gen_locs <- data.frame(name=gen_names, x=2.5, y=seq(1/11, 10/11, length.out=length(gen_names)), stringsAsFactors=FALSE)
locs <- rbind(dummy_locs, well_locs, fp_locs, gen_dummy_locs, gen_locs)
# Look up each facility's ID by name (as.character guards against factor
# indexing) and sort so row k of `locs` corresponds to graph vertex k.
locs$id <- ids[as.character(locs$name)]
locs <- locs %>% arrange(id)
# Build the flow network as an igraph object, attach the layout
# coordinates and labels computed above, and plot it.
g = graph_from_adjacency_matrix(v) %>%
set_vertex_attr('label', value=as.vector(locs$name)) %>%
set_vertex_attr('x', value=as.vector(locs$x)) %>%
set_vertex_attr('y', value=as.vector(locs$y)) %>%
set_vertex_attr('label.degree', value=pi) %>%
set_vertex_attr('size', value=8) %>%
as.undirected() # flow direction is implied by x position; undirected plots cleaner
par(mar=c(0,0,0,0)) # zero margins: let the graph fill the plotting area
plot.igraph(g, vertex.label.dist=3)
The dummy node is necessary because JAGS cannot index an empty subset: every node must have at least one inflow to index over. The dummy node has zero mass flowing out of it, so it does not affect any of the computed totals.
# Flatten the data frames into plain lists, the format JAGS expects.
regression_list = regression_df %>% select(-c(date, h)) %>% as.list()
operating_conditions_list = operating_conditions %>% arrange(well_id) %>% select(whp_pred) %>% as.list()
fp_constants_list = as.list(fp_constants)
gen_constants_list = as.list(gen_constants %>% select(gen_id, factor))
# One row per facility ID, with the constants relevant to each facility
# type joined in (NA where a constant does not apply to that type).
facilities = data.frame(id=1:max(ids)) %>%
full_join(operating_conditions %>% rename(id=well_id) %>% select(-well), by='id') %>%
full_join(gen_constants %>% select(factor, id=gen_id), by='id') %>%
full_join(fp_constants %>% rename(id=fp_id), by='id') %>%
mutate(mf_pred=NA) %>% # placeholder; mf_pred is computed inside the JAGS model
mutate(n_inflows=colSums(v)) # number of facilities flowing into each node
# NOTE(review): the warnings below look benign -- the ID columns carry a
# names attribute from indexing `ids`, which differs across join sides.
## Warning: Column `id` has different attributes on LHS and RHS of join
## Warning: Column `id` has different attributes on LHS and RHS of join
## Warning: Column `id` has different attributes on LHS and RHS of join
# Convert the facilities table (minus the ID column) to a plain list for
# JAGS, and precompute the ID vector for each class of facility so the
# model can loop over wells / FPs / dummy gens / gens separately.
facilities_list <- as.list(select(facilities, -id))
well_ids <- ids[well_names]
fp_ids <- ids[fp_names]
ip_gen_ids <- ids[ip_gen_names]
lp_gen_ids <- ids[lp_gen_names]
w_gen_ids <- ids[w_gen_names]
gen_ids <- ids[gen_names]
# insert production curve predictions
# A grid of wellhead pressures for a single well, used to trace out that
# well's predicted production curve for verification against the data.
# NOTE(review): well ID 9 is hard-coded here and again in the plotting
# code below -- consider a named constant.
prod = data.frame(whp_prod=seq(7, 12, length.out=10),
well_id_prod=9)
prod_list = prod %>% as.list()
# Assemble everything JAGS needs: observed data, facility constants,
# production-curve grid, ID vectors, network inflow lists (m), and the
# dummy node index.
data = c(regression_list, facilities_list, prod_list,
list(well_ids=well_ids, fp_ids=fp_ids, gen_ids=gen_ids,
ip_gen_ids=ip_gen_ids, lp_gen_ids=lp_gen_ids, w_gen_ids=w_gen_ids,
today_numeric=today_numeric, m=m, dummy=1))
# The JAGS model, in four parts:
#   1. Hierarchical linear regression of mass flow (mf) on wellhead
#      pressure and days-since-baseline, per well, with partial pooling
#      across wells and imputation of missing dates.
#   2. A production curve for one well (well_id_prod), for verification.
#   3. Gamma imputation models for missing enthalpy constants.
#   4. Deterministic propagation of predicted mass flows through the
#      network (wells -> FPs -> per-fluid dummy gens -> gens) to power.
# NOTE(review): `tau_date_numeric ~ dnorm(1e-8, 1e-8)` puts a normal
# prior on a precision parameter, which admits negative values; the other
# precisions use dgamma -- confirm dnorm is intentional here.
code = "
model {
#######################################
# fit individual regressions to wells #
#######################################
for (i in 1:length(whp)) {
# elliptic
# mu2[i] <- beta_whp[well_id[i]] * whp[i]^2 + beta_date[well_id[i]] * date_numeric[i]^2
# mu[i] <- sqrt(max(mu2[i], 0)) + Intercept[well_id[i]]
# linear
mu[i] <- Intercept[well_id[i]] + beta_whp[well_id[i]] * whp[i] + beta_date[well_id[i]] * date_numeric[i]
mf[i] ~ dnorm(mu[i], tau[well_id[i]])
}
# HIERARCHICAL
# fills in for any missing wells
for (j in well_ids) {
Intercept[j] ~ dnorm(mu_Intercept, tau_Intercept)
beta_whp[j] ~ dnorm(mu_beta_whp, tau_beta_whp)
beta_date[j] ~ dnorm(mu_date, tau_date)
tau[j] ~ dgamma(1e-8, 1e-8)
}
# fill in any missing dates
for (i in 1:length(whp)) {
date_numeric[i] ~ dnorm(mu_date_numeric, tau_date_numeric)
}
mu_date_numeric ~ dnorm(0, 1e-8)
tau_date_numeric ~ dnorm(1e-8, 1e-8)
# set hyperparameters
mu_Intercept ~ dnorm(0, 1e-8)
mu_beta_whp ~ dnorm(0, 1e-12)
mu_date ~ dnorm(0, 1e-8)
tau_Intercept ~ dgamma(1e-8, 1e-8)
tau_beta_whp ~ dgamma(1e-8, 1e-8)
tau_date ~ dgamma(1e-8, 1e-8)
#####################################
# production curve for verification #
#####################################
for (i in 1:length(whp_prod)) {
# elliptic
# mf_prod2[i] <- beta_whp[well_id_prod[i]] * whp_prod[i]^2 + beta_date[well_id_prod[i]] * today_numeric^2
# mf_prod[i] <- sqrt(max(mf_prod2[i], 0)) + Intercept[well_id_prod[i]]
# linear
mf_prod[i] <- Intercept[well_id_prod[i]] + beta_whp[well_id_prod[i]] * whp_prod[i] + beta_date[well_id_prod[i]] * today_numeric
}
######################################################
# simple model to fill in missing enthalpy constants #
######################################################
for (i in fp_ids) {
# missing fp constants
hf_ip[i] ~ dgamma(param[1], param[7])
hg_ip[i] ~ dgamma(param[2], param[8])
hfg_ip[i] ~ dgamma(param[3], param[9])
hf_lp[i] ~ dgamma(param[4], param[10])
hg_lp[i] ~ dgamma(param[5], param[11])
hfg_lp[i] ~ dgamma(param[6], param[12])
}
for (i in c(1, well_ids)) { h[i] ~ dgamma(param[13], param[14]) } # missing well constants
for (i in 1:14) { param[i] ~ dgamma(1e-8, 1e-8) } # uniform priors
########################################
# make predictions (the stuff we want) #
########################################
mf_pred[dummy] <- 0 # dummy well
ip_sf[dummy] <- 0
lp_sf[dummy] <- 0
wf[dummy] <- 0
for (i in well_ids) {
# elliptic
# mf_pred2[i] <- beta_whp[i] * whp_pred[i]^2 + beta_date[i] * today_numeric^2
# mf_pred[i] <- sqrt(max(mf_pred2[i], 0)) + Intercept[i]
# linear
mf_pred[i] <- Intercept[i] + beta_whp[i] * whp_pred[i] + beta_date[i] * today_numeric
}
for (i in fp_ids) {
mf_pred[i] <- sum(mf_pred[m[i,1:n_inflows[i]]])
h[i] <- sum(mf_pred[m[i, 1:n_inflows[i]]] * h[m[i, 1:n_inflows[i]]]) / ifelse(mf_pred[i]!=0, mf_pred[i], 1)
ip_sf[i] <- (h[i] - hf_ip[i]) / hfg_ip[i] * mf_pred[i]
lp_sf[i] <- (hf_ip[i] - hf_lp[i]) / hfg_lp[i] * (mf_pred[i] - ip_sf[i])
total_sf[i] <- ip_sf[i] + lp_sf[i]
wf[i] <- total_sf[i]
}
# dummy gens and actual gens
for (i in ip_gen_ids) { mf_pred[i] <- sum(ip_sf[m[i, 1:n_inflows[i]]]) }
for (i in lp_gen_ids) { mf_pred[i] <- sum(lp_sf[m[i, 1:n_inflows[i]]]) }
for (i in w_gen_ids) { mf_pred[i] <- sum(wf[m[i, 1:n_inflows[i]]]) }
for (i in gen_ids) {
mf_pred[i] <- sum(mf_pred[m[i,1:n_inflows[i]]])
power[i] <- mf_pred[i] / mu_factor[i]
mu_factor[i] ~ dunif(0.95*factor[i], 1.05*factor[i]) # uncertainty from email
}
total_power <- sum(power[gen_ids])
}
"
# Variables to monitor: predicted mass flow for every real node (index 1
# is the dummy, skipped), generator power, FP enthalpy, and the
# production curve.
vars <- c(sprintf('mf_pred[%d]', 2:length(data$mf_pred)),
          sprintf('power[%d]', gen_ids),
          sprintf('h[%d]', fp_ids),
          sprintf('mf_prod[%d]', 1:length(data$whp_prod)))
# MCMC settings
n_chains <- 4
burn_in <- 200
n_steps <- 1000
# Compile the JAGS model, run the burn-in, then draw posterior samples
# of the monitored variables.
model = jags.model(textConnection(code), data, n.chains=n_chains)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 1565
## Unobserved stochastic nodes: 146
## Total graph size: 6026
##
## Initializing model
update(model, burn_in) # discard burn-in iterations
out = coda.samples(model, n.iter=n_steps, variable.names=vars)
# Reshape the MCMC output from wide (one column per monitored variable,
# e.g. "mf_pred[12]") to long form with a variable name ("mf_pred") and
# a facility ID (12), then map IDs back to facility names.
outmatrix <- as.matrix(out)
outframe <- as.data.frame(outmatrix)
outframe <- gather(outframe, key=facility, value=value)
outframe <- mutate(outframe,
                   variable=gsub("\\[.*$", "", facility),
                   facility=parse_number(facility))
outframe$facility <- names(ids)[outframe$facility]
# Posterior density plots by facility class. The four plots differ only
# in the facility subset, monitored variable, and axis label, so build
# them with a shared helper instead of four copy-pasted ggplot chains.
# plot_density: density of posterior `value` for one monitored variable,
# restricted to the given facilities (zero/negative draws excluded).
plot_density <- function(df, facility_names, which_var, xlab) {
  ggplot(df %>% filter(facility %in% facility_names, variable==which_var, value>0),
         aes(x=value, fill=facility)) +
    geom_density(alpha=0.5, color=NA) + xlim(0, NA) + labs(x=xlab)
}
g1 = plot_density(outframe, well_names, "mf_pred", "Mass flow")
g2 = plot_density(outframe, fp_names, "mf_pred", "Mass flow")
g3 = plot_density(outframe, gen_names, "mf_pred", "Mass flow")
g4 = plot_density(outframe, gen_names, "power", "Power")
# Interactive versions: wells and FPs standalone, generator mass flow
# and power side by side.
ggplotly(g1, tooltip=c('facility', 'value'))
ggplotly(g2, tooltip=c('facility', 'value'))
p3 = ggplotly(g3, tooltip=c('facility', 'value'))
p4 = ggplotly(g4, tooltip=c('facility', 'value'))
subplot(p3, p4)
# Extract the production-curve samples (mf_prod[i]) and attach the
# wellhead pressure each index corresponds to.
# NOTE(review): this reuses the name `prod`, clobbering the input grid
# defined earlier (and masking base::prod) -- consider renaming.
prod = as.data.frame(outmatrix) %>%
select(contains('prod')) %>%
gather(key=facility, value=value) %>%
mutate(which=parse_number(facility)) %>% # index i parsed from "mf_prod[i]"
mutate(whp=data$whp_prod[which]) %>%
rename(mf=value)
# For comparison, fit per-date elliptic curves (mf^2 linear in whp^2)
# to the observed data for well 9 by ordinary least squares.
plotdata = regression_df %>%
filter(well_id==9) %>%
mutate(datetime = factor(as.Date(date))) %>%
group_by(datetime) %>%
filter(n()>2) %>% # need more than 2 points per date to fit a curve
mutate(mf2 = mf^2)
# Separate intercept and whp^2 slope per date (+0 drops the global intercept).
mylm = lm(mf2 ~ datetime * I(whp^2) + datetime +0, data=plotdata)
# Evaluate each fitted curve on a fine whp grid.
datetime = plotdata$datetime %>% unique()
whp = seq(0, 16, 0.01)
reglines = expand.grid(datetime=datetime, whp=whp)
reglines$mf2 <- unname(predict(mylm, reglines))
reglines$mf <- sqrt(reglines$mf2) # negative fitted mf2 -> NaN, dropped when plotted
## Warning in sqrt(reglines$mf2): NaNs produced
reglines$date = as.POSIXct(reglines$datetime) # continuous date for the color scale
# Overlay: posterior production-curve samples (jittered points plus
# quantile regression lines), the per-date OLS curves, and the raw
# observations for well 9.
ggplot(prod, aes(x=whp, y=mf)) +
geom_jitter(width=0, alpha=0.01) + # near-transparent points show posterior density
geom_quantile() +
geom_line(data=reglines, aes(linetype=datetime, color=date)) +
geom_point(data=plotdata, aes(color=date)) +
labs(color="Date", linetype="Fitted curve") +
xlim(0, NA) + ylim(0, NA)
## Smoothing formula not specified. Using: y ~ x
## Warning: Removed 1599 rows containing missing values (geom_path).